In [97]:
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.express as px
import seaborn as sns
from feature_engine.selection import SmartCorrelatedSelection
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

import polars as pl

# Path needs to be added manually to read from another folder
path2add = os.path.normpath(
    os.path.abspath(os.path.join(os.path.dirname("__file__"), os.path.pardir, "utils"))
)
if not (path2add in sys.path):
    sys.path.append(path2add)

from feature_engineering import aggregate_node_features, get_graph_features
In [3]:
%load_ext autoreload
%autoreload 2
In [4]:
plotly.offline.init_notebook_mode()

Data¶

We will import the supervised_call_graphs.json file to see if there are any features that can be engineered, because the clean_data_supervised.parquet file already contains descriptive features. This json file contains graph data, so before this data can be analyzed, the data must be processed.

In [5]:
apidf = pl.read_parquet('../data/clean_data_supervised.parquet')
callsdf = pl.read_json('../data/supervised_call_graphs.json')
In [6]:
callsdf.head()
Out[6]:
shape: (5, 2)
_idcall_graph
strlist[struct[2]]
"1f2c32d8-2d6e-3b68-bc46-789469…[{"1f873432-6944-3df9-8300-8a3cf9f95b35","5862055b-35a6-316a-8e20-3ae20c1763c2"}, {"8955faa9-0e33-37ad-a1dc-f0e640a114c2","a4fd6415-1fd4-303e-aa33-bb1830b5d9d4"}, … {"016099ea-6f20-3fec-94cf-f7afa239f398","6fa8ad53-2f0d-3f44-8863-139092bfeda9"}]
"4c486414-d4f5-33f6-b485-24a8ed…[{"016099ea-6f20-3fec-94cf-f7afa239f398","946e3ced-48a5-3de5-ad5a-1d20b1ab7eb5"}, {"a05a261f-128d-3cd8-a8e1-d6e52e161947","375c16ea-5f8d-32d5-8893-639d9b3a53d6"}, … {"68acdde8-bd53-39d1-9be0-fd67a281d7be","d7a53acc-eb6e-3f6c-b72e-9aefb54dd311"}]
"7e5838fc-bce1-371f-a3ac-d8a0b2…[{"1f873432-6944-3df9-8300-8a3cf9f95b35","5862055b-35a6-316a-8e20-3ae20c1763c2"}, {"857c4b20-3057-30e0-9ca3-d6f5c3dbe4a6","857c4b20-3057-30e0-9ca3-d6f5c3dbe4a6"}, … {"016099ea-6f20-3fec-94cf-f7afa239f398","6fa8ad53-2f0d-3f44-8863-139092bfeda9"}]
"82661ecd-d87f-3dff-855e-378f7c…[{"47896677-7c81-381f-8d03-3b2c94a27fdc","8244a4e7-5f5e-384e-b6a1-d6f065cecb11"}, {"089d44f6-bdf6-3a42-886a-db8e427fd2e0","756ab2fe-a386-32dd-9a4e-18785c38a414"}, … {"22d3028b-b12a-34d7-b641-886ab54ae6ff","22d3028b-b12a-34d7-b641-886ab54ae6ff"}]
"d62d56ea-775e-328c-8b08-db7ad7…[{"876b4958-7df1-3b2b-9def-1a22f1d444e3","aadf8ca9-ffda-30f0-bacf-2203e80c0811"}, {"cb8ef584-d1ad-3d44-a328-792f6556c23f","be305dea-fec3-3b4d-92b4-6cb88038c4cc"}, … {"d7d9c8e7-fbe2-3195-a903-20ab61c63de7","0fd98078-d00b-36c2-b067-79baa4e93068"}]

Pre-process graph data¶

In [7]:
calls_processed = (
    callsdf.with_columns(
        pl.col("call_graph").list.eval(
            pl.element().struct.rename_fields(["from", "to"])
        )
    )
    .explode("call_graph")
    .unnest("call_graph")
)

calls_processed.head()
Out[7]:
shape: (5, 3)
_idfromto
strstrstr
"1f2c32d8-2d6e-3b68-bc46-789469…"1f873432-6944-3df9-8300-8a3cf9…"5862055b-35a6-316a-8e20-3ae20c…
"1f2c32d8-2d6e-3b68-bc46-789469…"8955faa9-0e33-37ad-a1dc-f0e640…"a4fd6415-1fd4-303e-aa33-bb1830…
"1f2c32d8-2d6e-3b68-bc46-789469…"85754db8-6a55-30b7-8558-dec75f…"85754db8-6a55-30b7-8558-dec75f…
"1f2c32d8-2d6e-3b68-bc46-789469…"9f08fee1-953c-3801-b254-c0256f…"876b4958-7df1-3b2b-9def-1a22f1…
"1f2c32d8-2d6e-3b68-bc46-789469…"857c4b20-3057-30e0-9ca3-d6f5c3…"857c4b20-3057-30e0-9ca3-d6f5c3…

Feature Engineering¶

We observe that each graph has a separate _id that can be later used to join to the main dataset. A graph consists of source and destination nodes which refer to the available API calls.

Basic Graph Level Features¶

The most basic graph-level that we can engineer are:

  • Number of edges (connections)
  • Number of nodes (APIs)

These features could be useful since most behaviors are going to have a "normal" range of APIs that they contact. If this number is too large or too small, this might be an indication of anomalous activity.

In [8]:
graph_features = calls_processed.group_by('_id').agg(
    pl.len().alias('n_connections'),
    pl.col('from'),
    pl.col('to')
).with_columns(
    pl.concat_list('from', 'to').list.unique().list.len().alias('n_unique_nodes')
).select([
    '_id',
    'n_connections',
    'n_unique_nodes'
])

graph_features.sample(3)
Out[8]:
shape: (3, 3)
_idn_connectionsn_unique_nodes
stru32u32
"0cdc6111-dc79-32d5-a221-5ba3df…2820
"cf3ebfff-a72b-3958-b863-e85de3…1510
"2e18a413-5ebf-3c4e-9dc5-47cfed…24269

Node Level Features¶

Since graphs consist of nodes, we can engineer a set of features around specific nodes (APIs). We can calculate:

  • Node degrees - the number of edges that come from/into a node. Very highly connected nodes can look anomalous
  • Node centrality - there are various centrality measures (e.g. Page Rank) but they all try to estimate how important to the whole graph is a specific node. This feature could be useful because a behavior pattern that doesn't touch any of the "central" APIs would look anomalous

These features can be broken down into:

  • global features - measure node attributes across all the graphs
  • local features - measure node attributes across a specific graph
In [9]:
calls_processed = calls_processed.with_columns(
    global_source_degrees = pl.len().over(pl.col('from')),
    global_dest_degrees = pl.len().over(pl.col('to')),
    local_source_degrees = pl.len().over(pl.col('from'), pl.col('_id')),
    local_dest_degrees = pl.len().over(pl.col('to'), pl.col('_id'))
)

calls_processed.sample(3)
Out[9]:
shape: (3, 7)
_idfromtoglobal_source_degreesglobal_dest_degreeslocal_source_degreeslocal_dest_degrees
strstrstru32u32u32u32
"8a989644-d121-315b-8206-deae98…"d68e78ab-d01a-35b5-b816-7d715c…"1d768e1f-ee4c-3486-9263-432754…375103578
"3bcc9249-8bf7-3622-b0ca-1c6b26…"756ab2fe-a386-32dd-9a4e-18785c…"699caece-830f-3194-ae24-7b4563…6808711246
"74931360-c3a3-353a-8dfc-72030c…"dab3f781-7cce-3286-a667-c9f295…"756ab2fe-a386-32dd-9a4e-18785c…240622416231

Now that the node-level features are calculated, we need to aggregate them for a specific graph (_id). When aggregating, we can calculate average, std, min, and max statistics for every feature to capture the distribution well.

In [10]:
node_features_agg = aggregate_node_features(
    calls_processed,
    node_features=[
        "global_source_degrees",
        "global_dest_degrees",
        "local_source_degrees",
        "local_dest_degrees",
    ],
    by="_id",
)

graph_features = graph_features.join(node_features_agg, on="_id")
In [11]:
graph_features.head()
Out[11]:
shape: (5, 19)
_idn_connectionsn_unique_nodesavg_global_source_degreesmin_global_source_degreesmax_global_source_degreesstd_global_source_degreesavg_global_dest_degreesmin_global_dest_degreesmax_global_dest_degreesstd_global_dest_degreesavg_local_source_degreesmin_local_source_degreesmax_local_source_degreesstd_local_source_degreesavg_local_dest_degreesmin_local_dest_degreesmax_local_dest_degreesstd_local_dest_degrees
stru32u32f64u32u32f64f64u32u32f64f64u32u32f64f64u32u32f64
"be3aae78-b140-37f9-af2d-da867f…170378085.788235333320718264.7506079182.858824474224167620.3436928.3176471164.5578778.1529411184.706802
"66561100-a361-3636-a81b-342d55…5330223.5660386596175.431612479.58490651151408.4670892.54717151.3807195.1132081103.256121
"73e1fe84-da14-309e-8783-0da801…104488211.971154111320719959.34161710563.75961580224169094.5043874.6730771144.4752666.8461541166.07201
"b49d6f27-2c83-3fa3-9646-5a3573…7542218.22596201.588623346.70666771151368.7080363.08182.2586774.4133331113.556848
"42752144-40ff-3abc-b88c-adf61f…3324848.666667104043207112509.44828314065.012172048911126.6943881.666667120.577351.666667120.57735

Feature Selection¶

Feature selection will be done using 2 steps:

  1. Quality checks - if the feature is constant or has too many missing values (>= 95%) it will be dropped
  2. Correlation analysis - if features have very high correlation (>= 95%) with each other, they can be dropped as well
In [12]:
engineered_features = graph_features.columns[1:]
engineered_features
Out[12]:
['n_connections',
 'n_unique_nodes',
 'avg_global_source_degrees',
 'min_global_source_degrees',
 'max_global_source_degrees',
 'std_global_source_degrees',
 'avg_global_dest_degrees',
 'min_global_dest_degrees',
 'max_global_dest_degrees',
 'std_global_dest_degrees',
 'avg_local_source_degrees',
 'min_local_source_degrees',
 'max_local_source_degrees',
 'std_local_source_degrees',
 'avg_local_dest_degrees',
 'min_local_dest_degrees',
 'max_local_dest_degrees',
 'std_local_dest_degrees']

Quality checks¶

In [13]:
null_counts = graph_features.null_count().transpose(include_header=True, header_name='col', column_names=['null_count'])
null_counts.filter(pl.col('null_count') > 0)
Out[13]:
shape: (4, 2)
colnull_count
stru32
"std_global_source_degrees"42
"std_global_dest_degrees"42
"std_local_source_degrees"42
"std_local_dest_degrees"42
In [14]:
static_features = graph_features.select(engineered_features).std().transpose(include_header=True, header_name='col', column_names=['std'])
static_features.filter(pl.col('std') == 0)
Out[14]:
shape: (0, 2)
colstd
strf64
Observations¶
  • 4 columns have missing values. All of them calculate standard deviation
Impact¶
  • No features will be dropped for quality reasons

Correlation Analysis¶

In [15]:
feature_corrs = graph_features.select(engineered_features).to_pandas().dropna().corr()
feature_corrs.index = feature_corrs.columns
matrix = np.triu(feature_corrs)
fig = plt.figure(figsize=(20, 10))
sns.heatmap(feature_corrs, annot=True, mask=matrix)
Out[15]:
<Axes: >
No description has been provided for this image

There are groups of highly correlated features. Applying SmartCorrelatedSelection should reduce the feature set of engineered features.

In [16]:
features_pd = graph_features.select(engineered_features).to_pandas().dropna()

tr = SmartCorrelatedSelection(
    variables=None,
    method="pearson",
    threshold=0.95,
    missing_values="raise",
    selection_method="variance",
    estimator=None,
)

tr.fit(features_pd)

print('Features to drop:')
for f in tr.features_to_drop_:
    print(f)
Features to drop:
std_global_dest_degrees
n_unique_nodes
max_local_source_degrees
max_local_dest_degrees
std_local_dest_degrees
avg_local_dest_degrees
avg_local_source_degrees
Observations¶
  • The engineered features have groups of high correlation
Impact¶
  • std_global_dest_degrees, n_unique_nodes, max_local_source_degrees, max_local_dest_degrees, std_local_dest_degrees, avg_local_dest_degrees, avg_local_source_degrees will be dropped due to having high correlations and low variances compared with the other features

EDA for Remaining Engineered Features¶

In [17]:
remaining_engineered_features = list(set(features_pd).difference(set(tr.features_to_drop_)))
graph_features_merged = graph_features.join(apidf.select(['_id', 'is_anomaly']),  on='_id')
In [18]:
remaining_engineered_features
Out[18]:
['std_local_source_degrees',
 'min_local_source_degrees',
 'avg_global_source_degrees',
 'max_global_dest_degrees',
 'max_global_source_degrees',
 'std_global_source_degrees',
 'min_local_dest_degrees',
 'min_global_source_degrees',
 'avg_global_dest_degrees',
 'n_connections',
 'min_global_dest_degrees']
In [19]:
graph_features_merged.head()
Out[19]:
shape: (5, 20)
_idn_connectionsn_unique_nodesavg_global_source_degreesmin_global_source_degreesmax_global_source_degreesstd_global_source_degreesavg_global_dest_degreesmin_global_dest_degreesmax_global_dest_degreesstd_global_dest_degreesavg_local_source_degreesmin_local_source_degreesmax_local_source_degreesstd_local_source_degreesavg_local_dest_degreesmin_local_dest_degreesmax_local_dest_degreesstd_local_dest_degreesis_anomaly
stru32u32f64u32u32f64f64u32u32f64f64u32u32f64f64u32u32f64bool
"1f2c32d8-2d6e-3b68-bc46-789469…28214474055.6650123320716840.7197154547.6299182224166567.76252925.519674112630.88907330.768167116440.924937false
"4c486414-d4f5-33f6-b485-24a8ed…12702805174.5267723320717527.9707545858.02362212224167074.26566616.27086617619.45824218.71023616520.34814false
"7e5838fc-bce1-371f-a3ac-d8a0b2…15893544174.3694153320717048.3854214814.5173062224166676.39863414.92196317917.70099319.62303319324.258697false
"82661ecd-d87f-3dff-855e-378f7c…4591145867.78649212320717153.5803216689.27668810224166900.94889910.0631811277.32125710.7559911288.03984false
"d62d56ea-775e-328c-8b08-db7ad7…89236914.842697533207110320.0045815613.4157338224167792.4105215.9213481123.1630855.157303192.250752false
In [20]:
remaining_features_df = graph_features_merged.select(['_id'] + remaining_engineered_features + ['is_anomaly'])
remaining_features_df.head()
Out[20]:
shape: (5, 13)
_idstd_local_source_degreesmin_local_source_degreesavg_global_source_degreesmax_global_dest_degreesmax_global_source_degreesstd_global_source_degreesmin_local_dest_degreesmin_global_source_degreesavg_global_dest_degreesn_connectionsmin_global_dest_degreesis_anomaly
strf64u32f64u32u32f64u32u32f64u32u32bool
"1f2c32d8-2d6e-3b68-bc46-789469…30.88907314055.66501222416320716840.719715134547.62991828212false
"4c486414-d4f5-33f6-b485-24a8ed…19.45824215174.52677222416320717527.970754135858.023622127012false
"7e5838fc-bce1-371f-a3ac-d8a0b2…17.70099314174.36941522416320717048.385421134814.51730615892false
"82661ecd-d87f-3dff-855e-378f7c…7.32125715867.78649222416320717153.5803211126689.27668845910false
"d62d56ea-775e-328c-8b08-db7ad7…3.16308516914.842697224163207110320.0045811535613.415738938false
In [21]:
remaining_features_df.columns[1:-1]
Out[21]:
['std_local_source_degrees',
 'min_local_source_degrees',
 'avg_global_source_degrees',
 'max_global_dest_degrees',
 'max_global_source_degrees',
 'std_global_source_degrees',
 'min_local_dest_degrees',
 'min_global_source_degrees',
 'avg_global_dest_degrees',
 'n_connections',
 'min_global_dest_degrees']
In [22]:
remaining_features_df.select(pl.corr('min_local_dest_degrees', 'is_anomaly')).item()
Out[22]:
0.1871244054310182
In [23]:
for col in remaining_features_df.columns[1:-1]:
    correl = remaining_features_df.select(pl.corr(col, "is_anomaly")).item()
    print(f'{col}: {correl}')
std_local_source_degrees: -0.4599070276100464
min_local_source_degrees: 0.17967814928163506
avg_global_source_degrees: 0.019838141497681823
max_global_dest_degrees: -0.21899813519545483
max_global_source_degrees: -0.18965286839023868
std_global_source_degrees: -0.024380534174284717
min_local_dest_degrees: 0.1871244054310182
min_global_source_degrees: 0.19599078710208603
avg_global_dest_degrees: -0.13069273069548185
n_connections: -0.3707623919264986
min_global_dest_degrees: 0.3039360455504536

Feature Importances¶

A Random Forest will be trained on the engineered features to determine feature importances. Because several of these features have high cardinality and impurity-based importances are biased towards this, I will also determine permutation importances.

In [47]:
X = remaining_features_df.select(remaining_features_df.columns[1:-1]).to_numpy()
y = remaining_features_df.select('is_anomaly').to_numpy().ravel()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state = 0)
In [74]:
rf = RandomForestClassifier(n_estimators=100, min_samples_leaf=20, random_state=0)
rf.fit(X_train, y_train)
Out[74]:
RandomForestClassifier(min_samples_leaf=20, random_state=0)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(min_samples_leaf=20, random_state=0)
In [91]:
importances = rf.feature_importances_
feature_names = remaining_features_df.columns[1:-1]
feature_importance_df = pl.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importance_df = feature_importance_df.sort(by='Importance', descending=True)
In [92]:
feature_importance_df
Out[92]:
shape: (11, 2)
FeatureImportance
strf64
"n_connections"0.292084
"std_local_source_degrees"0.267337
"std_global_source_degrees"0.110376
"max_global_dest_degrees"0.077428
"avg_global_source_degrees"0.065096
……
"min_global_source_degrees"0.049449
"avg_global_dest_degrees"0.046397
"max_global_source_degrees"0.037231
"min_local_source_degrees"0.000789
"min_local_dest_degrees"0.0
In [93]:
rf_imp_fig = px.bar(feature_importance_df, x='Feature', y='Importance')
rf_imp_fig.show()
In [71]:
r = permutation_importance(rf, X_test, y_test, n_repeats=50, random_state=0)
In [94]:
perm_importances = r.importances_mean
perm_importance_df = pl.DataFrame({'Feature': feature_names, 'Importance': perm_importances})
perm_importance_df = perm_importance_df.sort(by='Importance', descending=True)
In [96]:
perm_importance_df
Out[96]:
shape: (11, 2)
FeatureImportance
strf64
"n_connections"0.202124
"std_local_source_degrees"0.1541
"std_global_source_degrees"0.056578
"avg_global_source_degrees"0.035044
"max_global_dest_degrees"0.029499
……
"min_global_dest_degrees"0.01351
"avg_global_dest_degrees"0.012802
"min_local_source_degrees"0.003127
"max_global_source_degrees"0.00059
"min_local_dest_degrees"0.000059
In [95]:
perm_fig = px.bar(perm_importance_df, x='Feature', y='Importance')
perm_fig.show()
Observations¶
  • min_local_source_degrees and min_local_dest degrees have the lowest importance scores in the impurity-based feature importance, and are the bottom 2 out of 3 in the permutation importance score
  • most of the predictive features are global
Impact¶
  • min_local_source_degrees and min_local_dest degrees should be dropped
  • a tree-based model should be used to model the relationships between the engineered features
In [80]:
remaining_engineered_features = [f for f in remaining_engineered_features if f not in ['min_local_dest_degrees', 'min_local_source_degrees']]
print('Final engineered featureset:')
print(remaining_engineered_features)
Final engineered featureset:
['std_local_source_degrees', 'avg_global_source_degrees', 'max_global_dest_degrees', 'max_global_source_degrees', 'std_global_source_degrees', 'min_global_source_degrees', 'avg_global_dest_degrees', 'n_connections', 'min_global_dest_degrees']

Feature Engineering Pipeline¶

In [90]:
selected_features = ['std_local_source_degrees', 
                     'avg_global_source_degrees', 
                     'max_global_dest_degrees', 
                     'max_global_source_degrees', 
                     'std_global_source_degrees', 
                     'min_global_source_degrees', 
                     'avg_global_dest_degrees', 
                     'n_connections', 
                     'min_global_dest_degrees']

callsdf = (
    (
        pl.read_json("../data/supervised_call_graphs.json")
        .with_columns(
            pl.col("call_graph").list.eval(
                pl.element().struct.rename_fields(["from", "to"])
            )
        )
        .explode("call_graph")
        .unnest("call_graph")
    )
    .with_columns(
        global_source_degrees=pl.len().over(pl.col("from")),
        global_dest_degrees=pl.len().over(pl.col("to")),
        local_source_degrees=pl.len().over(pl.col("from"), pl.col("_id")),
        local_dest_degrees=pl.len().over(pl.col("to"), pl.col("_id")),
    )
    .pipe(get_graph_features)
    .select(["_id"] + selected_features)
)

pl.read_parquet("../data/clean_data_supervised.parquet").join(
    callsdf, on="_id"
).write_parquet("../data/features_clean_data_supervised.parquet")

Summary¶

Feature Engineering Summary¶

  • 18 new features were engineered, which measured graph and node related features
  • Graph-level features measure the total size of the graphs
  • Node level features measure the degrees on global and local levels
  • 7 features were dropped due to high correlation within group
  • 2 more features were dropped due to low feature importance scores

Implications for ML¶

  • Engineered and selected 9 features could be useful in the prediction task, so they should be included into the final model
  • Feature engineering pipeline was designed, so new data can be easily transformed
In [ ]: